{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CV: Feature Dropout"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Repeatedly drop a random subset of features and re-run cross-validation to check for possible overfitting. Features with a lower prior importance are dropped more often."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pygoose import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import gc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import lightgbm as lgb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import StratifiedKFold"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Project handle; it provides the feature loader and the features/temp directories used below.\n",
    "# NOTE(review): `kg` (like `np` and `pd`) is presumably brought in by the pygoose star import.\n",
    "project = kg.Project.discover()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Model-specific parameters."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Number of stratified cross-validation folds used to score each experiment.\n",
    "NUM_FOLDS = 5"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Search-specific parameters."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Fraction of the feature columns removed in each search iteration.\n",
    "DROPOUT_FEATURE_FRACTION = 0.2\n",
    "# Number of equal-width bins the prior importances are cut into;\n",
    "# a feature's bin number determines how likely it is to be dropped.\n",
    "NUM_IMPORTANCE_BINS = 5\n",
    "# Number of random dropout experiments to run.\n",
    "NUM_SEARCH_ITERATIONS = 50"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "Make subsequent runs consistent and reproducible."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Shared seed: it seeds NumPy's global RNG here and is reused for the\n",
    "# cross-validation split and the LightGBM seeds.\n",
    "RANDOM_SEED = 100500\n",
    "np.random.seed(RANDOM_SEED)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Names of the feature lists to load; passed to project.load_feature_lists().\n",
    "feature_lists = [\n",
    "    'simple_summaries',\n",
    "    'jaccard_ngrams',\n",
    "    'fuzzy',\n",
    "    'tfidf',\n",
    "    'lda',\n",
    "    'nlp_tags',\n",
    "    'wordnet_similarity',\n",
    "    'phrase_embedding',\n",
    "    'wmd',\n",
    "    'wm_intersect',\n",
    "    \n",
    "    '3rdparty_abhishek',\n",
    "    '3rdparty_dasolmar_whq',\n",
    "    '3rdparty_mephistopheies',\n",
    "    '3rdparty_image_similarity',\n",
    "    \n",
    "    'magic_pagerank',\n",
    "    'magic_frequencies',\n",
    "    'magic_cooccurrence_matrix',\n",
    "    \n",
    "    'oofp_nn_mlp_with_magic',\n",
    "    'oofp_nn_cnn_with_magic',\n",
    "    'oofp_nn_bi_lstm_with_magic',\n",
    "    'oofp_nn_siamese_lstm_attention',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Load the train and test frames for the selected feature lists;\n",
    "# the third return value is not used in this notebook.\n",
    "df_train, df_test, _ = project.load_feature_lists(feature_lists)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Training targets; used as the stratification labels and as the LightGBM labels.\n",
    "y_train = kg.io.load(project.features_dir + 'y_train.pickle')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Compute dropout probabilities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "gbm_importances = {\n",
    "    # Place prior feature importances here as `feature_name: importance`,\n",
    "    # with names matching the columns of df_train. Features that are not\n",
    "    # listed are assigned a default mid-range importance bin.\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Tabulate the prior importances, one (feature, importance) row per entry.\n",
    "imps = pd.DataFrame(\n",
    "    list(gbm_importances.items()),\n",
    "    columns=['feature', 'importance'],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pd.cut() raises on empty input, so only bin when prior importances were\n",
    "# provided; with none given, the bin column is simply left empty.\n",
    "if len(imps) > 0:\n",
    "    # Equal-width bins labelled 1 (least important) to NUM_IMPORTANCE_BINS (most important).\n",
    "    imps['importance_bin'] = pd.cut(\n",
    "        imps['importance'],\n",
    "        NUM_IMPORTANCE_BINS,\n",
    "        labels=list(range(1, NUM_IMPORTANCE_BINS + 1)),\n",
    "    )\n",
    "else:\n",
    "    imps['importance_bin'] = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Lookup table: feature name -> importance bin label.\n",
    "importance_bin = {\n",
    "    feature: bin_label\n",
    "    for feature, bin_label in zip(imps['feature'], imps['importance_bin'])\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Unnormalized dropout weight: the reciprocal of each feature's importance bin,\n",
    "# so features in lower bins get larger weights. Features without a prior\n",
    "# importance fall back to a mid-range bin.\n",
    "dropout_probs = 1 / np.array([\n",
    "    importance_bin.get(column, NUM_IMPORTANCE_BINS // 2 + 1)\n",
    "    for column in df_train.columns\n",
    "])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Normalize the vector so that it sums to 1, as required for the sampling probabilities passed to `np.random.choice`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Rescale in place by the reciprocal of the total.\n",
    "dropout_probs *= 1 / dropout_probs.sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Run the search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def run_experiment(dropout_feature_list):\n",
    "    \"\"\"Cross-validate LightGBM on the training data without the given features.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    dropout_feature_list : list-like of str\n",
    "        Names of the `df_train` columns to remove before training.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list\n",
    "        `[dropout_feature_list, train_score, val_score]`, where the scores are\n",
    "        the binary log loss on the training and validation parts, taken at each\n",
    "        fold's best iteration and averaged over `NUM_FOLDS` stratified folds.\n",
    "    \"\"\"\n",
    "    X_train = df_train.drop(dropout_feature_list, axis=1).values\n",
    "\n",
    "    kfold = StratifiedKFold(\n",
    "        n_splits=NUM_FOLDS,\n",
    "        shuffle=True,\n",
    "        random_state=RANDOM_SEED,\n",
    "    )\n",
    "\n",
    "    # The parameters do not depend on the fold, so they are built only once.\n",
    "    lgb_params = {\n",
    "        'objective': 'binary',\n",
    "        'metric': 'binary_logloss',\n",
    "        'boosting': 'gbdt',\n",
    "        'device': 'cpu',\n",
    "        'feature_fraction': 0.5,\n",
    "        'num_leaves': 64,\n",
    "        'learning_rate': 0.03,\n",
    "        'num_boost_round': 3000,\n",
    "        'early_stopping_rounds': 5,\n",
    "        'verbose': 1,\n",
    "        'bagging_fraction_seed': RANDOM_SEED,\n",
    "        'feature_fraction_seed': RANDOM_SEED,\n",
    "    }\n",
    "\n",
    "    experiment_scores = []\n",
    "\n",
    "    for ix_train, ix_val in kfold.split(X_train, y_train):\n",
    "        X_fold_train = X_train[ix_train]\n",
    "        X_fold_val = X_train[ix_val]\n",
    "\n",
    "        y_fold_train = y_train[ix_train]\n",
    "        y_fold_val = y_train[ix_val]\n",
    "\n",
    "        lgb_data_train = lgb.Dataset(X_fold_train, y_fold_train)\n",
    "        lgb_data_val = lgb.Dataset(X_fold_val, y_fold_val)\n",
    "        evals_result = {}\n",
    "\n",
    "        model = lgb.train(\n",
    "            # Pass a copy so that every fold starts from the same, unmodified parameters.\n",
    "            dict(lgb_params),\n",
    "            lgb_data_train,\n",
    "            valid_sets=[lgb_data_train, lgb_data_val],\n",
    "            evals_result=evals_result,\n",
    "            num_boost_round=lgb_params['num_boost_round'],\n",
    "            early_stopping_rounds=lgb_params['early_stopping_rounds'],\n",
    "            verbose_eval=False,\n",
    "        )\n",
    "\n",
    "        fold_train_scores = evals_result['training'][lgb_params['metric']]\n",
    "        fold_val_scores = evals_result['valid_1'][lgb_params['metric']]\n",
    "\n",
    "        # Early stopping only fires after several rounds without improvement, so the\n",
    "        # last recorded round is past the optimum. Score the fold at the model's best\n",
    "        # iteration instead; fall back to the last round if none was recorded.\n",
    "        best_round = model.best_iteration or 0\n",
    "        if not 0 < best_round <= len(fold_val_scores):\n",
    "            best_round = len(fold_val_scores)\n",
    "\n",
    "        experiment_scores.append([\n",
    "            fold_train_scores[best_round - 1],\n",
    "            fold_val_scores[best_round - 1],\n",
    "        ])\n",
    "\n",
    "    # Average the [train, val] pairs over the folds.\n",
    "    final_experiment_score = np.mean(np.array(experiment_scores), axis=0)\n",
    "\n",
    "    # Release the reduced feature matrix and the last model before the next experiment.\n",
    "    del X_train\n",
    "    del model\n",
    "    gc.collect()\n",
    "\n",
    "    return [\n",
    "        dropout_feature_list,\n",
    "        final_experiment_score[0],\n",
    "        final_experiment_score[1],\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Accumulates one [dropped_features, train_score, val_score] row per experiment.\n",
    "all_experiments_log = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# The number of dropped features is the same in every iteration.\n",
    "num_dropout_features = int(len(df_train.columns) * DROPOUT_FEATURE_FRACTION)\n",
    "\n",
    "for i in range(NUM_SEARCH_ITERATIONS):\n",
    "    print(f'Iteration {i + 1} of {NUM_SEARCH_ITERATIONS}')\n",
    "\n",
    "    # Sample the features to remove without replacement, weighted by dropout_probs.\n",
    "    dropout_list = np.random.choice(\n",
    "        df_train.columns,\n",
    "        size=num_dropout_features,\n",
    "        replace=False,\n",
    "        p=dropout_probs,\n",
    "    )\n",
    "\n",
    "    print(f'Removing {dropout_list}')\n",
    "    # Hand over a plain list: run_experiment() returns it as the first log field, and a\n",
    "    # list is written to the CSV in full and in a parseable form, unlike the line-wrapped\n",
    "    # (and, for long arrays, elided) repr of a NumPy array.\n",
    "    experiment_result = run_experiment(dropout_list.tolist())\n",
    "    _, result_train, result_val = experiment_result\n",
    "\n",
    "    print(f'Train: {result_train:.6f}   Val: {result_val:.6f}   Diff: {result_val - result_train:.6f}')\n",
    "    all_experiments_log.append(experiment_result)\n",
    "    # Rewrite the whole log after every iteration so partial results survive an interruption.\n",
    "    pd.DataFrame(all_experiments_log).to_csv(project.temp_dir + 'dropout_experiments.log', index=None)\n",
    "\n",
    "    print()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
